In [1]:
import plotly.graph_objects as go
import plotly.io as pio

# Custom "nike" theme registered with plotly so figures can opt in via
# template="nike".
# Note - 'family' must be a single string, NOT a list or dict!
_nike_layout = {
    # Fonts
    'title': {
        'font': {
            'family': 'HelveticaNeue-CondensedBold, Helvetica, Sans-serif',
            'size': 30,
            'color': '#333',
        }
    },
    'font': {
        'family': 'Helvetica Neue, Helvetica, Sans-serif',
        'size': 12,
        'color': '#333',
    },
    # Colorways: brand blue first, grey second
    'colorway': ["#2445ec", '#a4abab'],
    # Keep adding others as needed below
    # 'hovermode': 'x unified'
}

# Per-trace-type defaults; each graph object must sit in a tuple or list.
_nike_data = {
    'bar': [
        go.Bar(
            texttemplate='%{value:$.2s}',
            textposition='outside',
            textfont={
                'family': 'Helvetica Neue, Helvetica, Sans-serif',
                'size': 20,
                'color': '#FFFFFF',
            },
        )
    ]
}

pio.templates["nike"] = go.layout.Template(layout=_nike_layout, data=_nike_data)

---
title: Assignment 03
author:
  - name: Dominique Strong
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: today
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2
execute:
  echo: false
  eval: false
  freeze: true
---


Data Importing and Preparation

In [2]:
# Imports — consolidated and deduplicated (the original imported `col`
# twice and split pyspark.sql.functions across three statements).
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col,
    explode,
    monotonically_increasing_id,
    regexp_replace,
    split,
    transform,
    when,
)

# Seed numpy's global RNG so the jitter added to the scatter plots below
# is reproducible across re-runs.
np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data. multiLine + escape handle quoted fields that contain
# newlines / embedded quotes in the Lightcast CSV.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("./data/lightcast_job_postings.csv")
)
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")

#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 03:14:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/24 03:14:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
In [3]:
# Cast the salary and experience columns from string to float so they can
# be aggregated and plotted numerically.
for numeric_col in ("SALARY", "SALARY_FROM", "SALARY_TO",
                    "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE"):
    df = df.withColumn(numeric_col, col(numeric_col).cast("float"))
In [4]:
# Create the Remote_Group label from the numeric REMOTE_TYPE code.
# NOTE(review): code meanings assumed as 1=Remote, 3=Hybrid — confirm
# against REMOTE_TYPE_NAME. Every other code (including 2 and nulls)
# maps to "Onsite"; the original chain's explicit `== 2 -> "Onsite"`
# branch was redundant with the otherwise() fallback and is folded in.
df = df.withColumn(
    "Remote_Group",
    when(col("REMOTE_TYPE") == 3, "Hybrid")
    .when(col("REMOTE_TYPE") == 1, "Remote")
    .otherwise("Onsite")
)
In [5]:
# For question #3 - collapse the numeric MIN_EDULEVELS codes into broader
# buckets. Codes 99, 0 and 1 (and anything unrecognised, including nulls)
# all land in "Associate or lower" via the otherwise() fallback.
df = df.withColumn(
    "EDU_GROUP",
    when(col("MIN_EDULEVELS") == 2, "Bachelor")
    .when(col("MIN_EDULEVELS") == 3, "Master's")
    .when(col("MIN_EDULEVELS") == 4, "PhD")
    .otherwise("Associate or lower")
)
In [6]:
# Computing the medians

def compute_median(spark_df, col_name):
    """Approximate median of a numeric column.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        Frame to query.
    col_name : str
        Name of the numeric column.

    Returns
    -------
    float or None
        The 0.5 quantile (relative error 0.01), or None when the column
        holds no non-null values — approxQuantile returns an empty list
        in that case, which previously raised IndexError.
    """
    quantiles = spark_df.approxQuantile(col_name, [0.5], 0.01)
    return quantiles[0] if quantiles else None

median_from = compute_median(df, "SALARY_FROM")
median_to = compute_median(df, "SALARY_TO")
median_salary = compute_median(df, "SALARY")

print(f'Medians: SALARY_FROM = {median_from}, SALARY_TO = {median_to}, SALARY = {median_salary}')
[Stage 4:>                                                          (0 + 1) / 1]
Medians: SALARY_FROM = 87295.0, SALARY_TO = 130042.0, SALARY = 115024.0
                                                                                
In [7]:
# Impute missing salary fields with the (approximate) column medians
# computed in the previous cell.
salary_fill_values = {
    "SALARY_FROM": median_from,
    "SALARY_TO": median_to,
    "SALARY": median_salary,
}
df = df.fillna(salary_fill_values)
In [8]:
# Mid-point of the posted salary range; used as the y-axis in later plots.
df = df.withColumn("Average_Salary", (col("SALARY_FROM") + col("SALARY_TO")) / 2)

# Columns carried into the cleaned export and the downstream pandas frames.
# Fixed: 'REMOTE_GROUP' -> 'Remote_Group' to match the name the column was
# created with; the old spelling only resolved because Spark's column
# lookup is case-insensitive by default (spark.sql.caseSensitive=false).
export_cols = [
    'EDUCATION_LEVELS_NAME',
    'REMOTE_TYPE_NAME',
    'EMPLOYMENT_TYPE_NAME',
    'MAX_YEARS_EXPERIENCE',
    'Average_Salary',
    'SALARY',
    'NAICS2_NAME',
    'EDU_GROUP',
    'Remote_Group',
    'LOT_V6_SPECIALIZED_OCCUPATION_NAME',
]

df_selected = df.select(export_cols)
In [9]:
# Materialise the cleaned selection in pandas and persist it for reuse.
cleaned_pdf = df_selected.toPandas()
cleaned_pdf.to_csv("./data/lightcast_cleaned.csv", index=False)
                                                                                
In [10]:
# NOTE(review): this filtered frame is immediately overwritten by the next
# plotting cell's unfiltered select — presumably leftover from an earlier
# iteration; consider deleting this cell.
pdf = df_selected.filter(col("SALARY") > 0).select("NAICS2_NAME", "SALARY").toPandas()
                                                                                

1) Salary Distribution by Industry and Employment Type¶

In [12]:
# Box plot of salary by 2-digit NAICS industry name.
pdf = df_selected.select("NAICS2_NAME", "SALARY").toPandas()
fig = px.box(
    pdf, x="NAICS2_NAME", y="SALARY", title="Salary Distribution by NAICS Name",
    color_discrete_sequence=['#2445ec'],
    points='outliers'
    )
fig.update_layout(
    template="nike", height=800, width=1100, margin=dict(b=100),
    xaxis_title="NAICS Name", yaxis_title="Salary (USD)")
# Rotate x axis labels. Fixed: tick styling is now set in one place —
# previously update_layout set tickangle=35 only for update_xaxes to
# immediately override it with 45.
fig.update_xaxes(tickangle=45, automargin=True, tickfont=dict(size=12))
fig.show()
                                                                                

This graph illustrates how salaries vary by NAICS sector: Administrative, Support, Waste Management and Remediation Services has the highest salaries of all sectors, reaching about $500K, while Unclassified has the lowest at roughly $15K.

While many sectors — such as Information, Public Administration, and Real Estate — span wide salary ranges, others such as Retail Trade cluster tightly around the median salary of $115K, so postings far above or below that level appear as outliers.


2) Salary Analysis by ONET Occupation Type (Bubble Chart)¶

In [13]:
# Median salary and posting volume per LOT occupation, aggregated in SQL
# against the registered job_postings temp view.
occupation_salary_sql = """
    
    SELECT
        LOT_OCCUPATION_NAME as `Occupation_Name`,
        PERCENTILE(SALARY, 0.5) as `Median_Salary`,
        count(*) as `Job_Postings`
    FROM job_postings
        GROUP BY 1
        ORDER BY 3 DESC

"""

salary_analysis = spark.sql(occupation_salary_sql)
salary_df = salary_analysis.toPandas()
                                                                                
In [14]:
# Preview the aggregated occupation-level salary table.
salary_df.head()
Out[14]:
Occupation_Name Median_Salary Job_Postings
0 Data / Data Mining Analyst 95250.0 30057
1 Business Intelligence Analyst 125900.0 29445
2 Computer Systems Engineer / Architect 157600.0 8212
3 Business / Management Analyst 93650.0 4326
4 Clinical Analyst / Clinical Documentation and ... 89440.0 261
In [15]:
# Bubble chart: one marker per occupation; bubble size and colour both
# encode posting volume so high-demand roles stand out.
bubble_kwargs = dict(
    x="Occupation_Name",
    y="Median_Salary",
    size="Job_Postings",
    color="Job_Postings",
    title="Median Salary by Lot Occupation Name",
    labels={"Job_Postings": "Job Postings"},
    hover_data={"Occupation_Name": True, "Median_Salary": True, "Job_Postings": True},
    size_max=60,
    width=1100,
    height=600,
    color_continuous_scale="Plasma",
)
fig = px.scatter(salary_df, **bubble_kwargs)
fig.update_layout(
    template="nike",
    margin=dict(b=100),
    xaxis=dict(tickangle=-30, automargin=True),
    xaxis_title="Occupation Name",
    yaxis_title="Median Salary (USD)",
)
fig.show()

This graph shows that Data Mining and Business Intelligence analysts have the highest number of job postings, where the former has a lower median salary compared to the latter. Computer Systems Engineer roles, however, have a moderate number of job postings at a higher median salary, whereas Business, Clinical, and Market Research Analyst roles have the lowest number of job postings at the lowest salaries.


3) Salary by Education Level¶

In [16]:
# Average salary vs. max years of experience, coloured by education bucket.
salary_edu_df = (
    df_selected
    .filter((col("SALARY") > 0) & (col("MAX_YEARS_EXPERIENCE") > 0))
    .select("Average_Salary", "MAX_YEARS_EXPERIENCE", "EDU_GROUP",
            "LOT_V6_SPECIALIZED_OCCUPATION_NAME")
    .toPandas()
)

# Add small horizontal jitter so overlapping integer experience values
# don't stack into vertical lines (reproducible thanks to np.random.seed
# set earlier).
salary_edu_df["MAX YEARS EXPERIENCE"] = (
    salary_edu_df["MAX_YEARS_EXPERIENCE"]
    + np.random.uniform(-0.2, 0.2, size=len(salary_edu_df))
)

fig = px.scatter(
    salary_edu_df, 
    x="MAX YEARS EXPERIENCE", 
    y="Average_Salary", 
    title="Experience vs Salary by Education Level",
    color='EDU_GROUP',
    labels={"EDU_GROUP": "Education Level"}
    )
fig.update_layout(
    template="nike", 
    height=600, 
    width=1100, 
    margin=dict(b=100),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)")
fig.update_traces(
    # Fixed: 'dark grey' is not a valid CSS colour name and plotly's
    # colour validator rejects it; 'darkgrey' is the intended value.
    marker=dict(line=dict(width=1, color='darkgrey'))
)
# One tick per year of experience
fig.update_xaxes(dtick=1, tickfont=dict(size=12))
fig.show()
                                                                                

This graph illustrates how salary varies by education level and experience — after close examination, it is clear that as the years of experience increase, so does the average salary. For example, across all education levels, when there are more than 3 years of experience the average salary tends to fall somewhere between $100K and $200K; conversely, at 1 year of experience average salaries tend to fall below $100K. Interestingly, a Bachelor's degree seems to dominate salaries in the 6+ years of experience range.


4) Salary by Remote Work Type¶

In [17]:
# Average salary vs. max years of experience, coloured by work arrangement.
Remote_df = (
    df_selected
    .filter((col("SALARY") > 0) & (col("MAX_YEARS_EXPERIENCE") > 0))
    .select("Average_Salary", "MAX_YEARS_EXPERIENCE", "EDU_GROUP",
            "LOT_V6_SPECIALIZED_OCCUPATION_NAME", "Remote_Group")
    .toPandas()
)


fig = px.scatter(
    Remote_df, 
    x="MAX_YEARS_EXPERIENCE", 
    y="Average_Salary", 
    title="Experience vs Salary by Remote Work Type",
    color='Remote_Group',
    labels={"Remote_Group": "Remote Work Type"},
    )
fig.update_layout(
    template="nike", 
    height=600, 
    width=1100, 
    margin=dict(b=100),
    xaxis_title="Years of Experience",
    yaxis_title="Average Salary (USD)")
fig.update_traces(
    # Fixed: 'dark grey' is not a valid CSS colour name and plotly's
    # colour validator rejects it; 'darkgrey' is the intended value.
    marker=dict(line=dict(width=1, color='darkgrey'))
)
# One tick per year of experience
fig.update_xaxes(dtick=1, tickfont=dict(size=12))
fig.show()
                                                                                

Similar to the prior graph, this graph compares average salaries across years of experience, but now by work arrangement: onsite, remote, or hybrid. One key callout: for most (if not all) years of experience, the highest average salaries occur for onsite work — onsite workers tend to be paid more on average than remote workers, whereas hybrid salaries fluctuate the most.